Introduction
Graffiti represents a persistent challenge to urban neighborhoods, affecting aesthetic quality, public safety perceptions, and property values. Chicago’s 311 service request system provides a comprehensive record of graffiti removal requests across the city. This analysis applies spatial predictive modeling to understand whether we can forecast graffiti concentrations using spatial features and count regression models.
Analysis Goals: - Develop a spatial predictive model using k-nearest neighbor and Local Moran’s I features - Compare Poisson and Negative Binomial regression performance - Validate models using spatial cross-validation (Leave-One-Group-Out) - Test temporal stability using 2018 crime data - Evaluate predictive accuracy against kernel density estimation baseline
Part 1: Data Loading and Exploration
Loading Required Libraries
Code
library (tidyverse)
library (lubridate)
library (sf)
library (sp)
library (raster)
library (spdep)
library (spatstat)
library (ggplot2)
library (gridExtra)
library (viridis)
library (MASS)
library (broom)
library (jsonlite)
library (httr)
theme_set (theme_minimal ())
Downloading Graffiti Data from Chicago 311 Portal
Code
# Chicago 311 API endpoint for graffiti data
api_url <- "https://data.cityofchicago.org/resource/hec5-y4x5.json"
# Query parameters: download all available data
params <- list (
"$limit" = 50000
)
# Execute API request
response <- GET (api_url, query = params)
graffiti_raw <- fromJSON (content (response, "text" ))
graffiti_df <- as_tibble (graffiti_raw)
cat ("Raw Graffiti Data Summary: \n " )
Raw Graffiti Data Summary:
Code
cat ("Total Records:" , nrow (graffiti_df), " \n " )
Code
cat ("Variables:" , ncol (graffiti_df), " \n\n " )
Code
# Check available columns
cat ("Available columns: \n " )
Code
print (colnames (graffiti_df))
[1] "creation_date"
[2] "status"
[3] "completion_date"
[4] "service_request_number"
[5] "type_of_service_request"
[6] "what_type_of_surface_is_the_graffiti_on_"
[7] "where_is_the_graffiti_located_"
[8] "street_address"
[9] "zip_code"
[10] "x_coordinate"
[11] "y_coordinate"
[12] "ward"
[13] "police_district"
[14] "community_area"
[15] "latitude"
[16] "longitude"
[17] "location"
[18] ":@computed_region_awaf_s7ux"
[19] ":@computed_region_6mkv_f3dw"
[20] ":@computed_region_vrxf_vc4k"
[21] ":@computed_region_bdys_3d7i"
[22] ":@computed_region_43wa_7qmu"
[23] "ssa"
Data Cleaning and Preparation
Code
# Clean and prepare graffiti data
graffiti_clean <- graffiti_df %>%
filter (! is.na (longitude) & ! is.na (latitude)) %>%
mutate (
longitude = as.numeric (longitude),
latitude = as.numeric (latitude),
creation_date = as.POSIXct (creation_date),
year = year (creation_date),
month = month (creation_date),
date = as_date (creation_date)
) %>%
filter (longitude > - 88.5 & longitude < - 87.5 ,
latitude > 41.6 & latitude < 42.1 )
cat ("Data Cleaning Summary: \n " )
Code
cat ("Records after cleaning: " , nrow (graffiti_clean), " \n " )
Records after cleaning: 49973
Code
if (nrow (graffiti_clean) > 0 ) {
cat ("Date range: " , min (graffiti_clean$ date, na.rm = TRUE ),
" to " , max (graffiti_clean$ date, na.rm = TRUE ), " \n " )
}
Date range: 17709 to 17883
Code
# A tibble: 5 × 26
creation_date status completion_date service_request_number
<dttm> <chr> <chr> <chr>
1 2018-12-18 00:00:00 Completed 2018-12-19T00:00:00.000 18-03388768
2 2018-12-18 00:00:00 Completed 2018-12-19T00:00:00.000 18-03388476
3 2018-12-18 00:00:00 Open <NA> 18-03386436
4 2018-12-18 00:00:00 Completed 2018-12-19T00:00:00.000 18-03386535
5 2018-12-18 00:00:00 Open <NA> 18-03388464
# ℹ 22 more variables: type_of_service_request <chr>,
# what_type_of_surface_is_the_graffiti_on_ <chr>,
# where_is_the_graffiti_located_ <chr>, street_address <chr>, zip_code <chr>,
# x_coordinate <chr>, y_coordinate <chr>, ward <chr>, police_district <chr>,
# community_area <chr>, latitude <dbl>, longitude <dbl>, location <df[,2]>,
# `:@computed_region_awaf_s7ux` <chr>, `:@computed_region_6mkv_f3dw` <chr>,
# `:@computed_region_vrxf_vc4k` <chr>, `:@computed_region_bdys_3d7i` <chr>, …
Converting to Spatial Object
Code
# Convert to sf object (WGS84)
graffiti_wgs84 <- st_as_sf (graffiti_clean,
coords = c ("longitude" , "latitude" ),
crs = 4326 )
# Project to UTM Zone 16N for accurate distance calculations
graffiti_utm <- st_transform (graffiti_wgs84, crs = 32616 )
cat ("Spatial Object Created: \n " )
Code
cat ("Total features:" , nrow (graffiti_utm), " \n " )
Loading Chicago City Boundary
Code
# Load Chicago boundary from city's open data portal
chicago_url <- "https://data.cityofchicago.org/api/geospatial/igwz-8jzy?method=export&format=GeoJSON"
chicago_boundary <- st_read (chicago_url, quiet = TRUE )
# Project to UTM Zone 16N
chicago_utm <- st_transform (chicago_boundary, crs = 32616 )
cat ("Chicago Boundary Loaded \n " )
Code
cat ("Area:" , round (as.numeric (st_area (chicago_utm))/ 1e6 , 1 ), "km² \n " )
Area: 4.8 9.1 6 6.6 5.3 8.1 8.2 7.1 2.9 11.3 6 8.3 6.5 5 10.2 8.3 9.6 2.6 10.1 3 5.1 9.3 9.3 11.8 18.5 3.4 5 14.7 8.3 11.9 7.6 4.3 4.6 2.6 4.3 1.6 1.8 4.5 2.7 3.9 4.2 5.4 7.6 7.6 3.2 8.7 1.6 4.5 12.5 5.5 28.2 7.7 9.2 9.1 13.5 10.9 5.2 7 3.7 5.4 12.5 3 5.7 6.6 7.6 9.1 8.2 8 9.2 12.6 9.8 8.2 7.4 7 8.5 34.5 4.5 km²
Visualizing Spatial Distribution
Code
# Create base map showing all graffiti points
ggplot () +
geom_sf (data = chicago_utm, fill = NA , color = "black" , linewidth = 0.8 ) +
geom_sf (data = graffiti_utm, color = "#E31C23" , size = 0.8 , alpha = 0.3 ) +
theme_minimal () +
labs (
title = "Spatial Distribution of Graffiti Removal Requests" ,
subtitle = paste ("Total requests:" , nrow (graffiti_utm)),
x = "UTM Easting (m)" ,
y = "UTM Northing (m)"
) +
theme (
plot.title = element_text (size = 14 , face = "bold" ),
plot.subtitle = element_text (size = 11 ),
axis.text = element_text (size = 9 )
)
Temporal Patterns
Code
# Calculate monthly statistics
monthly_stats <- graffiti_clean %>%
group_by (month) %>%
summarise (
count = n (),
avg_per_day = n () / n_distinct (date),
.groups = "drop"
) %>%
mutate (month_name = month.abb[month])
# Visualize monthly distribution
ggplot (monthly_stats, aes (x = reorder (month_name, month), y = count)) +
geom_col (fill = "#E31C23" , alpha = 0.8 , color = "black" , linewidth = 0.3 ) +
geom_text (aes (label = count), vjust = - 0.3 , size = 3.5 , fontface = "bold" ) +
theme_minimal () +
labs (
title = "Monthly Distribution of Graffiti Requests" ,
x = "Month" ,
y = "Number of Requests"
) +
theme (
axis.text.x = element_text (angle = 45 , hjust = 1 ),
plot.title = element_text (size = 13 , face = "bold" )
)
Code
cat (" \n Temporal Summary: \n " )
Code
# A tibble: 7 × 4
month count avg_per_day month_name
<dbl> <int> <dbl> <chr>
1 6 1337 334. Jun
2 7 10078 325. Jul
3 8 10098 326. Aug
4 9 8033 268. Sep
5 10 8808 284. Oct
6 11 7733 258. Nov
7 12 3886 216. Dec
Part 2: Fishnet Grid Creation
Creating 500m × 500m Grid
Code
# Create regular grid covering Chicago
fishnet_full <- st_make_grid (
chicago_utm,
cellsize = 500 ,
square = TRUE ,
what = "polygons"
) %>%
st_as_sf () %>%
mutate (grid_id = row_number ())
# Keep only grid cells intersecting Chicago boundary
fishnet_chicago <- fishnet_full[chicago_utm, ]
cat ("Fishnet Grid Created: \n " )
Code
cat ("Total grid cells:" , nrow (fishnet_chicago), " \n " )
Code
cat ("Cell size: 500m × 500m \n " )
Aggregating Graffiti Counts to Grid
Code
# Aggregate graffiti points to grid cells
graffiti_grid <- fishnet_chicago %>%
st_join (graffiti_utm, join = st_contains) %>%
group_by (grid_id) %>%
summarise (
n_graffiti = n (),
.groups = "drop"
) %>%
mutate (n_graffiti = ifelse (is.na (n_graffiti), 0 , n_graffiti))
cat ("Grid Aggregation Complete: \n " )
Grid Aggregation Complete:
Code
cat ("Total cells:" , nrow (graffiti_grid), " \n " )
Code
cat ("Cells with graffiti:" , sum (graffiti_grid$ n_graffiti > 0 ), " \n " )
Cells with graffiti: 2618
Code
cat ("Mean per cell:" , round (mean (graffiti_grid$ n_graffiti), 2 ), " \n\n " )
Code
print (summary (graffiti_grid$ n_graffiti))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 1.00 3.00 19.47 18.00 403.00
Visualizing Grid Aggregation
Code
# Map showing graffiti counts per grid cell
p1 <- ggplot () +
geom_sf (data = graffiti_grid, aes (fill = n_graffiti), color = NA ) +
geom_sf (data = chicago_utm, fill = NA , color = "black" , linewidth = 0.5 ) +
scale_fill_viridis_c (name = "Requests" , option = "plasma" ) +
theme_minimal () +
labs (
title = "Graffiti Count Aggregation to 500m Grid" ,
x = "UTM Easting (m)" ,
y = "UTM Northing (m)"
) +
theme (plot.title = element_text (size = 13 , face = "bold" ))
# Histogram of counts
p2 <- ggplot (graffiti_grid, aes (x = n_graffiti)) +
geom_histogram (bins = 40 , fill = "#E31C23" , alpha = 0.8 , color = "black" ) +
theme_minimal () +
labs (
title = "Distribution of Graffiti Counts" ,
x = "Requests per Cell" ,
y = "Frequency"
) +
theme (plot.title = element_text (size = 12 , face = "bold" ))
gridExtra:: grid.arrange (p1, p2, ncol = 2 )
Part 3: Spatial Features Construction
Creating Spatial Weights Matrix and Features
Code
# Step 1: Ensure graffiti_grid is sf with grid_id
graffiti_grid <- graffiti_grid %>%
mutate (grid_id = row_number ())
# Step 2: Convert to sp object (required by spdep)
graffiti_sp <- as_Spatial (graffiti_grid)
# Step 3: Create k-nearest neighbors weight matrix
knn_neighbors <- knearneigh (coordinates (graffiti_sp), k = 5 )
knn_weights <- knn2nb (knn_neighbors, sym = TRUE )
knn_weights_std <- nb2listw (knn_weights, style = "W" )
cat ("Spatial Weights Matrix Created (k=5 nearest neighbors) \n " )
Spatial Weights Matrix Created (k=5 nearest neighbors)
Code
# Step 4: Calculate k-NN feature (neighbor mean)
neighbor_lags <- lag.listw (knn_weights_std, graffiti_grid$ n_graffiti)
# Step 5: Calculate Local Moran's I
local_moran <- localmoran (graffiti_grid$ n_graffiti, knn_weights_std)
# Step 6: Calculate distance to hotspot
hotspot_threshold <- quantile (graffiti_grid$ n_graffiti, 0.75 )
hotspots <- graffiti_grid %>% filter (n_graffiti >= hotspot_threshold)
hotspot_center <- st_centroid (st_union (hotspots))
dist_to_hotspot <- as.numeric (st_distance (graffiti_grid, hotspot_center)) / 1000
# Step 7: Add all features to graffiti_grid
graffiti_grid <- graffiti_grid %>%
mutate (
neighbor_mean = neighbor_lags,
local_i = local_moran[, 1 ],
local_i_pval = local_moran[, 5 ],
dist_to_hotspot = dist_to_hotspot,
moran_cluster = case_when (
local_i_pval >= 0.05 ~ "Not significant" ,
local_i >= 0 & n_graffiti >= median (n_graffiti) ~ "High-High" ,
local_i >= 0 & n_graffiti < median (n_graffiti) ~ "Low-Low" ,
local_i < 0 & n_graffiti >= median (n_graffiti) ~ "High-Low" ,
TRUE ~ "Low-High"
)
)
cat (" \n Spatial Features Added: \n " )
Code
cat ("neighbor_mean: Average graffiti count in 5 nearest neighbors \n " )
neighbor_mean: Average graffiti count in 5 nearest neighbors
Code
cat ("local_i: Local Moran's I statistic \n " )
local_i: Local Moran's I statistic
Code
cat ("dist_to_hotspot: Distance to graffiti hotspot (km) \n " )
dist_to_hotspot: Distance to graffiti hotspot (km)
Code
cat ("moran_cluster: Classification of spatial clusters \n " )
moran_cluster: Classification of spatial clusters
Code
Code
feature_summary <- graffiti_grid %>%
st_drop_geometry ()
head (feature_summary, 5 )
# A tibble: 5 × 7
grid_id n_graffiti neighbor_mean local_i local_i_pval dist_to_hotspot
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0.200 0.317 27.9
2 2 1 1 0.200 0.317 28.0
3 3 1 1 0.200 0.273 28.1
4 4 1 1 0.200 0.317 28.2
5 5 1 1 0.200 0.317 28.3
# ℹ 1 more variable: moran_cluster <chr>
Visualizing Hotspots and Coldspots
Code
ggplot () +
geom_sf (data = graffiti_grid, aes (fill = moran_cluster), color = NA ) +
geom_sf (data = chicago_utm, fill = NA , color = "black" , linewidth = 0.5 ) +
scale_fill_manual (
name = "Cluster Type" ,
values = c (
"High-High" = "#d73027" ,
"Low-Low" = "#91bfdb" ,
"High-Low" = "#fee090" ,
"Low-High" = "#a6d96a" ,
"Not significant" = "#f7f7f7"
)
) +
theme_minimal () +
labs (
title = "Local Moran's I: Graffiti Hotspots and Coldspots" ,
x = "UTM Easting (m)" ,
y = "UTM Northing (m)"
) +
theme (plot.title = element_text (size = 13 , face = "bold" ))
Code
cat (" \n Cluster Distribution: \n " )
Code
print (table (graffiti_grid$ moran_cluster))
High-High High-Low Low-High Not significant
304 20 2 2292
Part 4: Count Regression Models
Preparing Data for Modeling
Code
# Prepare data for modeling
model_data <- graffiti_grid %>%
st_drop_geometry () %>%
na.omit ()
cat ("Model Data Prepared: \n " )
Code
cat ("Sample size:" , nrow (model_data), " \n " )
Code
cat ("Dependent variable (n_graffiti): \n " )
Dependent variable (n_graffiti):
Code
print (summary (model_data$ n_graffiti))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 1.00 3.00 19.47 18.00 403.00
Code
cat (" \n Overdispersion check: \n " )
Code
cat ("Mean:" , mean (model_data$ n_graffiti), " \n " )
Code
cat ("Variance:" , var (model_data$ n_graffiti), " \n " )
Code
cat ("Variance/Mean ratio:" , round (var (model_data$ n_graffiti)/ mean (model_data$ n_graffiti), 2 ), " \n " )
Variance/Mean ratio: 87.66
Poisson Regression
Code
# Fit Poisson regression
poisson_model <- glm (
n_graffiti ~ neighbor_mean + local_i + dist_to_hotspot,
family = poisson (link = "log" ),
data = model_data
)
cat ("POISSON REGRESSION RESULTS \n " )
POISSON REGRESSION RESULTS
Code
cat (strrep ("=" , 50 ), " \n\n " )
==================================================
Code
print (summary (poisson_model))
Call:
glm(formula = n_graffiti ~ neighbor_mean + local_i + dist_to_hotspot,
family = poisson(link = "log"), data = model_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.3068979 0.0126373 261.68 <2e-16 ***
neighbor_mean 0.0121075 0.0001187 101.97 <2e-16 ***
local_i 0.0780056 0.0009253 84.30 <2e-16 ***
dist_to_hotspot -0.0971570 0.0010339 -93.97 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 119805 on 2617 degrees of freedom
Residual deviance: 36856 on 2614 degrees of freedom
AIC: 45856
Number of Fisher Scoring iterations: 5
Code
# Extract metrics
poisson_aic <- AIC (poisson_model)
poisson_deviance <- deviance (poisson_model)
poisson_dispersion <- poisson_deviance / df.residual (poisson_model)
cat (" \n Model Fit Statistics: \n " )
Code
cat ("AIC:" , round (poisson_aic, 2 ), " \n " )
Code
cat ("Dispersion statistic:" , round (poisson_dispersion, 3 ), " \n " )
Dispersion statistic: 14.099
Negative Binomial Regression
Code
# Fit Negative Binomial regression
nb_model <- glm.nb (
n_graffiti ~ neighbor_mean + local_i + dist_to_hotspot,
data = model_data
)
cat (" \n NEGATIVE BINOMIAL REGRESSION RESULTS \n " )
NEGATIVE BINOMIAL REGRESSION RESULTS
Code
cat (strrep ("=" , 50 ), " \n\n " )
==================================================
Code
Call:
glm.nb(formula = n_graffiti ~ neighbor_mean + local_i + dist_to_hotspot,
data = model_data, init.theta = 1.254353791, link = log)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.4862158 0.0557367 44.606 <2e-16 ***
neighbor_mean 0.0326243 0.0008968 36.378 <2e-16 ***
local_i -0.0161810 0.0121606 -1.331 0.183
dist_to_hotspot -0.0826818 0.0034258 -24.135 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Negative Binomial(1.2544) family taken to be 1)
Null deviance: 8149.6 on 2617 degrees of freedom
Residual deviance: 2538.9 on 2614 degrees of freedom
AIC: 16295
Number of Fisher Scoring iterations: 1
Theta: 1.2544
Std. Err.: 0.0385
2 x log-likelihood: -16285.0510
Code
# Extract metrics
nb_aic <- AIC (nb_model)
nb_deviance <- deviance (nb_model)
cat (" \n Model Fit Statistics: \n " )
Code
cat ("AIC:" , round (nb_aic, 2 ), " \n " )
Model Comparison
Code
# Compare models
cat (" \n MODEL COMPARISON \n " )
Code
cat (strrep ("=" , 50 ), " \n " )
==================================================
Code
cat ("Poisson AIC:" , round (poisson_aic, 2 ), " \n " )
Code
cat ("Negative Binomial AIC:" , round (nb_aic, 2 ), " \n " )
Negative Binomial AIC: 16295.05
Code
cat ("Difference:" , round (poisson_aic - nb_aic, 2 ), " \n\n " )
Code
if (nb_aic < poisson_aic) {
cat ("Decision: Negative Binomial model preferred (lower AIC) \n " )
selected_model <- nb_model
model_name <- "Negative Binomial"
} else {
cat ("Decision: Poisson model preferred (lower AIC) \n " )
selected_model <- poisson_model
model_name <- "Poisson"
}
Decision: Negative Binomial model preferred (lower AIC)
Code
# Extract coefficients
coef_table <- tidy (selected_model, exponentiate = TRUE , conf.int = TRUE )
cat (" \n\n Selected Model Coefficients (Exponentiated - Incidence Rate Ratios): \n " )
Selected Model Coefficients (Exponentiated - Incidence Rate Ratios):
Code
# A tibble: 4 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 12.0 0.0557 44.6 0 10.6 13.6
2 neighbor_mean 1.03 0.000897 36.4 9.44e-290 1.03 1.04
3 local_i 0.984 0.0122 -1.33 1.83e- 1 0.960 1.01
4 dist_to_hotspot 0.921 0.00343 -24.1 1.07e-128 0.914 0.927
Part 5: Spatial Cross-Validation
Setting Up Spatial Folds
Code
# Create spatial folds
set.seed (123 )
n_folds <- 5
centroids <- st_coordinates (st_centroid (graffiti_grid))
spatial_folds <- kmeans (centroids, centers = n_folds)$ cluster
cv_data <- model_data %>%
mutate (fold = spatial_folds[row_number ()])
cat ("Spatial Cross-Validation Setup: \n " )
Spatial Cross-Validation Setup:
Code
cat ("Number of folds:" , n_folds, " \n " )
Code
Code
print (table (cv_data$ fold))
1 2 3 4 5
497 331 505 610 675
Running Cross-Validation
Code
# Cross-validation loop
cv_results <- tibble ()
for (fold_num in 1 : n_folds) {
train_data <- cv_data %>% filter (fold != fold_num)
test_data <- cv_data %>% filter (fold == fold_num)
# Fit model on training data
if (model_name == "Negative Binomial" ) {
fold_model <- glm.nb (
n_graffiti ~ neighbor_mean + local_i + dist_to_hotspot,
data = train_data
)
} else {
fold_model <- glm (
n_graffiti ~ neighbor_mean + local_i + dist_to_hotspot,
family = poisson (link = "log" ),
data = train_data
)
}
# Predictions
pred <- predict (fold_model, newdata = test_data, type = "response" )
fold_results <- test_data %>%
mutate (
fold = fold_num,
observed = n_graffiti,
predicted = pred,
error = observed - predicted,
abs_error = abs (error),
squared_error = error^ 2
)
cv_results <- bind_rows (cv_results, fold_results)
}
cat ("Cross-Validation Complete: \n " )
Cross-Validation Complete:
Code
cat ("Total predictions:" , nrow (cv_results), " \n " )
Cross-Validation Error Metrics
Code
# Calculate error metrics
mae <- mean (cv_results$ abs_error, na.rm = TRUE )
rmse <- sqrt (mean (cv_results$ squared_error, na.rm = TRUE ))
me <- mean (cv_results$ error, na.rm = TRUE )
cat ("SPATIAL CROSS-VALIDATION RESULTS \n " )
SPATIAL CROSS-VALIDATION RESULTS
Code
cat (strrep ("=" , 50 ), " \n " )
==================================================
Code
cat ("Mean Absolute Error (MAE):" , round (mae, 3 ), " \n " )
Mean Absolute Error (MAE): 44.017
Code
cat ("Root Mean Squared Error (RMSE):" , round (rmse, 3 ), " \n " )
Root Mean Squared Error (RMSE): 383.165
Code
cat ("Mean Error (ME):" , round (me, 3 ), " \n " )
Visualizing CV Results
Code
# Observed vs Predicted
p1 <- ggplot (cv_results, aes (x = observed, y = predicted)) +
geom_point (alpha = 0.5 , color = "#E31C23" ) +
geom_abline (intercept = 0 , slope = 1 , color = "blue" , linetype = "dashed" ) +
theme_minimal () +
labs (
title = "Spatial CV: Observed vs Predicted" ,
x = "Observed" ,
y = "Predicted" ,
subtitle = paste ("MAE =" , round (mae, 2 ))
) +
theme (plot.title = element_text (size = 12 , face = "bold" ))
# Residuals
p2 <- ggplot (cv_results, aes (x = error)) +
geom_histogram (bins = 30 , fill = "#E31C23" , alpha = 0.7 ) +
geom_vline (xintercept = 0 , color = "blue" , linetype = "dashed" ) +
theme_minimal () +
labs (
title = "Residuals Distribution" ,
x = "Prediction Error" ,
y = "Frequency"
) +
theme (plot.title = element_text (size = 12 , face = "bold" ))
gridExtra:: grid.arrange (p1, p2, ncol = 2 )
Part 6: Temporal Validation (2018 Data)
Loading 2018 Crime Data
Code
# Download 2018 crimes data
crime_url <- "https://data.cityofchicago.org/resource/3i3m-jwuy.json"
# Simple query without complex filters
response_2018 <- GET (crime_url, query = list ("$limit" = 50000 ))
crime_2018_raw <- fromJSON (content (response_2018, "text" ))
crime_2018_df <- as_tibble (crime_2018_raw)
cat ("2018 Crime Data Loaded: \n " )
Code
cat ("Total records:" , nrow (crime_2018_df), " \n " )
Code
cat ("Columns:" , ncol (crime_2018_df), " \n " )
Preparing 2018 Data
Code
# Clean 2018 data
crime_2018_clean <- crime_2018_df %>%
filter (! is.na (longitude) & ! is.na (latitude)) %>%
mutate (
longitude = as.numeric (longitude),
latitude = as.numeric (latitude)
) %>%
filter (longitude > - 88.5 & longitude < - 87.5 ,
latitude > 41.6 & latitude < 42.1 )
# Convert to sf
crime_2018_wgs84 <- st_as_sf (crime_2018_clean,
coords = c ("longitude" , "latitude" ),
crs = 4326 )
crime_2018_utm <- st_transform (crime_2018_wgs84, crs = 32616 )
cat ("2018 Data Cleaned: \n " )
Code
cat ("Records:" , nrow (crime_2018_utm), " \n " )
Aggregating 2018 to Same Grid
Code
# Aggregate to grid
crime_2018_grid <- fishnet_chicago %>%
st_join (crime_2018_utm, join = st_contains) %>%
group_by (grid_id) %>%
summarise (
n_crime = n (),
.groups = "drop"
) %>%
mutate (n_crime = ifelse (is.na (n_crime), 0 , n_crime))
cat ("2018 Aggregation: \n " )
Code
cat ("Total cells:" , nrow (crime_2018_grid), " \n " )
Code
cat ("Cells with crimes:" , sum (crime_2018_grid$ n_crime > 0 ), " \n " )
Predicting 2018 with 2017 Model
Code
# Prepare features
crime_2018_features <- crime_2018_grid %>%
st_drop_geometry () %>%
mutate (
observed_2018 = n_crime
) %>%
left_join (
model_data %>% mutate (grid_id = row_number ()),
by = "grid_id"
) %>%
na.omit ()
cat ("2018 Features Prepared: \n " )
Code
cat ("Records:" , nrow (crime_2018_features), " \n " )
Code
# Make predictions using 2017 model
pred_2018 <- predict (selected_model,
newdata = crime_2018_features,
type = "response" )
temporal_results <- crime_2018_features %>%
mutate (
predicted_2018 = pred_2018,
error_2018 = observed_2018 - predicted_2018,
abs_error_2018 = abs (error_2018),
squared_error_2018 = error_2018^ 2
)
cat ("Temporal Validation: \n " )
Code
cat ("Predictions made:" , nrow (temporal_results), " \n " )
Temporal Validation Metrics
Code
# Calculate 2018 metrics
mae_2018 <- mean (temporal_results$ abs_error_2018)
rmse_2018 <- sqrt (mean (temporal_results$ squared_error_2018))
cat ("TEMPORAL VALIDATION (2018) \n " )
TEMPORAL VALIDATION (2018)
Code
cat (strrep ("=" , 50 ), " \n " )
==================================================
Code
cat ("MAE:" , round (mae_2018, 3 ), " \n " )
Code
cat ("RMSE:" , round (rmse_2018, 3 ), " \n " )
Part 7: KDE Baseline Comparison
Calculating KDE Baseline
Code
# Create point pattern
graffiti_coords <- st_coordinates (graffiti_utm)
# Create window
window <- owin (xrange = range (graffiti_coords[, 1 ]),
yrange = range (graffiti_coords[, 2 ]))
# Create point pattern
graffiti_ppp <- ppp (x = graffiti_coords[, 1 ],
y = graffiti_coords[, 2 ],
window = window)
# Calculate KDE
kde_2017 <- density (graffiti_ppp, sigma = bw.diggle (graffiti_ppp))
kde_raster <- raster (kde_2017)
# Extract to grid
grid_centroids <- st_centroid (graffiti_grid)
kde_values <- raster:: extract (kde_raster, as_Spatial (grid_centroids))
# Normalize
kde_grid_counts <- kde_values * (500 * 500 ) / sum (kde_values, na.rm = TRUE ) *
sum (graffiti_grid$ n_graffiti)
# Calculate errors
kde_errors <- graffiti_grid %>%
st_drop_geometry () %>%
mutate (
predicted_kde = kde_grid_counts,
error_kde = n_graffiti - predicted_kde,
abs_error_kde = abs (error_kde),
squared_error_kde = error_kde^ 2
) %>%
na.omit ()
mae_kde <- mean (kde_errors$ abs_error_kde)
rmse_kde <- sqrt (mean (kde_errors$ squared_error_kde))
cat ("KDE BASELINE \n " )
Code
cat (strrep ("=" , 50 ), " \n " )
==================================================
Code
cat ("MAE:" , round (mae_kde, 3 ), " \n " )
Code
cat ("RMSE:" , round (rmse_kde, 3 ), " \n " )
Part 8: Final Model Comparison
Comprehensive Comparison
Code
# Compare all methods
comparison <- tibble (
Method = c (
paste ("Spatial CV -" , model_name),
"Temporal (2018)" ,
"KDE Baseline"
),
MAE = c (mae, mae_2018, mae_kde),
RMSE = c (rmse, rmse_2018, rmse_kde)
)
cat ("FINAL MODEL COMPARISON \n " )
Code
cat (strrep ("=" , 50 ), " \n " )
==================================================
Code
# A tibble: 3 × 3
Method MAE RMSE
<chr> <dbl> <dbl>
1 Spatial CV - Negative Binomial 44.0 383.
2 Temporal (2018) 41.6 201.
3 KDE Baseline 5204537. 13632995.
Code
cat (" \n Improvement Over KDE Baseline: \n " )
Improvement Over KDE Baseline:
Code
cat ("Spatial CV:" , round ((1 - mae/ mae_kde)* 100 , 1 ), "% \n " )
Code
cat ("Temporal:" , round ((1 - mae_2018/ mae_kde)* 100 , 1 ), "% \n " )
Code
# Visualize comparison
comparison_long <- comparison %>%
pivot_longer (cols = c (MAE, RMSE), names_to = "Metric" , values_to = "Value" )
ggplot (comparison_long, aes (x = Method, y = Value, fill = Metric)) +
geom_col (position = "dodge" , alpha = 0.8 , color = "black" ) +
geom_text (aes (label = round (Value, 2 )), position = position_dodge (width = 0.9 ),
vjust = - 0.3 , size = 3.5 ) +
theme_minimal () +
labs (
title = "Final Model Comparison: Error Metrics" ,
y = "Error (lower is better)"
) +
scale_fill_brewer (palette = "Set2" ) +
theme (
plot.title = element_text (size = 13 , face = "bold" ),
axis.text.x = element_text (angle = 30 , hjust = 1 )
)
Summary and Conclusions
Key Findings
Code
cat ("ANALYSIS SUMMARY \n " )
Code
cat (strrep ("=" , 50 ), " \n\n " )
==================================================
Code
cat ("1. DATA OVERVIEW \n " )
Code
cat (" • Graffiti requests analyzed:" , nrow (graffiti_utm), " \n " )
• Graffiti requests analyzed: 49973
Code
cat (" • Grid cells created:" , nrow (graffiti_grid), " \n " )
• Grid cells created: 2618
Code
cat (" • Cells with graffiti:" , sum (graffiti_grid$ n_graffiti > 0 ), " \n\n " )
• Cells with graffiti: 2618
Code
cat ("2. SPATIAL PATTERNS \n " )
Code
cat (" • Significant clusters identified:" ,
sum (graffiti_grid$ local_i_pval < 0.05 ), " \n " )
• Significant clusters identified: 326
Code
cat (" • Hotspot areas (High-High):" ,
sum (graffiti_grid$ moran_cluster == "High-High" ), " \n " )
• Hotspot areas (High-High): 304
Code
cat (" • Coldspot areas (Low-Low):" ,
sum (graffiti_grid$ moran_cluster == "Low-Low" ), " \n\n " )
• Coldspot areas (Low-Low): 0
Code
cat ("3. MODEL PERFORMANCE \n " )
Code
cat (" • Selected model:" , model_name, " \n " )
• Selected model: Negative Binomial
Code
cat (" • Spatial CV MAE:" , round (mae, 3 ), " \n " )
Code
cat (" • Spatial CV RMSE:" , round (rmse, 3 ), " \n " )
• Spatial CV RMSE: 383.165
Code
cat (" • Improvement over KDE:" , round ((1 - mae/ mae_kde)* 100 , 1 ), "% \n\n " )
• Improvement over KDE: 100 %
Code
cat ("4. TEMPORAL STABILITY \n " )
Code
cat (" • 2018 predictions MAE:" , round (mae_2018, 3 ), " \n " )
• 2018 predictions MAE: 41.618
Code
if (mae_2018 < mae * 1.3 ) {
cat (" • Model shows good temporal stability \n " )
} else {
cat (" • Model shows temporal degradation \n " )
}
• Model shows good temporal stability
Code
cat (" \n 5. SPATIAL FEATURES IMPORTANCE \n " )
5. SPATIAL FEATURES IMPORTANCE
Code
for (i in 2 : nrow (coef_table)) {
term <- coef_table$ term[i]
irr <- coef_table$ estimate[i]
pval <- coef_table$ p.value[i]
sig <- ifelse (pval < 0.05 , "***" , "" )
cat (" •" , term, "(IRR =" , round (irr, 3 ), ") " , sig, " \n " )
}
• neighbor_mean (IRR = 1.033 ) ***
• local_i (IRR = 0.984 )
• dist_to_hotspot (IRR = 0.921 ) ***
Report Generated: 2025-11-16 01:37:30
Course: MUSA 5080 - Public Policy Analytics
Assignment: Lab 4 - Spatial Predictive Analysis
Institution: University of Pennsylvania